Setup the Environment

CSS Styling
writeLines("td, th { padding : 3px } th { background-color:white; color:black; border:1px solid black; text-align:center } td {color:black; border:1px solid black; word-wrap:break-word; white-space:nowrap; overflow: hidden; text-overflow: ellipsis; max-width:300px; text-align:left}", con= "../css/style.css")

Visualization of Metabolites Between Replicate Pairs

Boxplots (Original Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
p <- plot_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
             x = "Abundance", y = "") 
plot(p)

Zero/Missing Values in Metabolites (Original Abundances)

# Identify Zero/Missing Values in Metabolites
################################################################################
na_df <- plot_df %>%
  ungroup() %>% 
  group_by(METABOLITE_NAME, REPLICATE) %>%
  mutate(ZERO_LOG = ifelse(VALUE == 0, 1, 0)) %>%
  mutate(ZERO_N = sum(ZERO_LOG)) %>%
  mutate(ZERO_FREQ = round(ZERO_N/length(shared_sams), digits = 2)) %>%
  mutate(NA_LOG = ifelse(is.na(VALUE), 1, 0)) %>%
  mutate(NA_N = sum(NA_LOG)) %>%
  mutate(NA_FREQ = round(NA_N/length(shared_sams), digits = 2)) %>%
  select(REPLICATE, METABOLITE_NAME, ZERO_N, ZERO_FREQ, NA_N, NA_FREQ) %>%
  unique() %>%
  arrange(REPLICATE, desc(ZERO_FREQ), desc(NA_FREQ)) %>%
  ungroup()

na_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "400px")
REPLICATE METABOLITE_NAME ZERO_N ZERO_FREQ NA_N NA_FREQ
811 Aminoisobutyric acid 77 1.00 0 0
811 Homocysteine 75 0.97 0 0
811 5-Hydroxylysine 67 0.87 0 0
811 3-Methylhistidine 56 0.73 0 0
811 Cysteine 53 0.69 0 0
811 Sarcosine 35 0.45 0 0
811 Cystathionine 10 0.13 0 0
811 1-Methylhistidine 9 0.12 0 0
811 beta-Alanine 2 0.03 0 0
811 Aminoadipic acid 1 0.01 0 0
811 Alanine 0 0.00 0 0
811 2-Aminobutyric acid 0 0.00 0 0
811 Anserine 0 0.00 0 0
811 Arginine 0 0.00 0 0
811 Asparagine 0 0.00 0 0
811 Aspartic acid 0 0.00 0 0
811 Carnosine 0 0.00 0 0
811 Citrulline 0 0.00 0 0
811 Ethanolamine 0 0.00 0 0
811 gamma-Aminobutyric acid 0 0.00 0 0
811 Glutamic acid 0 0.00 0 0
811 Glutamine 0 0.00 0 0
811 Glycine 0 0.00 0 0
811 Histidine 0 0.00 0 0
811 Hydroxyproline 0 0.00 0 0
811 Isoleucine 0 0.00 0 0
811 Leucine 0 0.00 0 0
811 Lysine 0 0.00 0 0
811 Methionine 0 0.00 0 0
811 Ornithine 0 0.00 0 0
811 Phenylalanine 0 0.00 0 0
811 Phosphoethanolamine 0 0.00 0 0
811 Proline 0 0.00 0 0
811 Serine 0 0.00 0 0
811 Taurine 0 0.00 0 0
811 Threonine 0 0.00 0 0
811 Tryptophan 0 0.00 0 0
811 Tyrosine 0 0.00 0 0
811 Valine 0 0.00 0 0
812 Aminoisobutyric acid 77 1.00 0 0
812 Homocysteine 72 0.94 0 0
812 5-Hydroxylysine 67 0.87 0 0
812 Cysteine 58 0.75 0 0
812 3-Methylhistidine 47 0.61 0 0
812 Sarcosine 34 0.44 0 0
812 1-Methylhistidine 13 0.17 0 0
812 Cystathionine 11 0.14 0 0
812 beta-Alanine 2 0.03 0 0
812 Alanine 0 0.00 0 0
812 Aminoadipic acid 0 0.00 0 0
812 2-Aminobutyric acid 0 0.00 0 0
812 Anserine 0 0.00 0 0
812 Arginine 0 0.00 0 0
812 Asparagine 0 0.00 0 0
812 Aspartic acid 0 0.00 0 0
812 Carnosine 0 0.00 0 0
812 Citrulline 0 0.00 0 0
812 Ethanolamine 0 0.00 0 0
812 gamma-Aminobutyric acid 0 0.00 0 0
812 Glutamic acid 0 0.00 0 0
812 Glutamine 0 0.00 0 0
812 Glycine 0 0.00 0 0
812 Histidine 0 0.00 0 0
812 Hydroxyproline 0 0.00 0 0
812 Isoleucine 0 0.00 0 0
812 Leucine 0 0.00 0 0
812 Lysine 0 0.00 0 0
812 Methionine 0 0.00 0 0
812 Ornithine 0 0.00 0 0
812 Phenylalanine 0 0.00 0 0
812 Phosphoethanolamine 0 0.00 0 0
812 Proline 0 0.00 0 0
812 Serine 0 0.00 0 0
812 Taurine 0 0.00 0 0
812 Threonine 0 0.00 0 0
812 Tryptophan 0 0.00 0 0
812 Tyrosine 0 0.00 0 0
812 Valine 0 0.00 0 0
# Remove Metabolites with high NAor Zero Frequencies
################################################################################
met_rm <- na_df %>%
  filter((ZERO_FREQ >= 0.95) | (NA_FREQ >= 0.8)) %>%
  select(METABOLITE_NAME) %>% unlist() %>% unique()
metabolites <- metabolites[metabolites %!in% met_rm]
plot_df <- plot_df %>%
  filter(METABOLITE_NAME %in% metabolites)
plot_df$METABOLITE_NAME <- as.character(plot_df$METABOLITE_NAME)

Distributions (Original Abundances)

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in c(reps)){
  # Collect numeric vector
  num_vec <- plot_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
NA_COUNT NA_FREQ MEAN MEDIAN MAX MIN MID_RANGE VARIANCE STD_DEV SE_MEAN Q1 Q3 KURTOSIS SKEW BC_LAMBDA
811 0 0 4.54 0.46 116.99 0 116.99 178.31 13.35 0.25 0.09 1.43 16.57 4 None
812 0 0 4.44 0.45 108.75 0 108.75 169.99 13.04 0.24 0.08 1.38 16.65 4 None
# Plot the density plot for all the gene counts
################################################################################
plot_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  xlim(min(df_all$MIN), mean(df_all$Q3)) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

Metabolite by Metabolite Correlation (Original Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}


# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Original Abundances)

# Subset the data for sites to compare (scatterplots)
###################################################
plot_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  filter(METABOLITE_NAME %in% metabolites) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                              grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                              )) %>%
  select(REPLICATE,METABOLITE_NAME,labelid,VALUE) %>%
  mutate(REPLICATE = as.character(REPLICATE)) %>%
  split(.$REPLICATE) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots
################################################################################
# Plot scatter plots ordered by sample (include R2 values)
lm_df <- plot_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
2-Aminobutyric acid 0.92
Alanine 0.746
Tyrosine 0.711
Citrulline 0.61
Ornithine 0.605
Threonine 0.594
Aspartic acid 0.584
Serine 0.58
Isoleucine 0.55
Asparagine 0.541
Proline 0.54
Hydroxyproline 0.508
Lysine 0.474
Tryptophan 0.427
Carnosine 0.376
Valine 0.375
Sarcosine 0.366
Glutamic acid 0.365
Leucine 0.313
Methionine 0.296
Anserine 0.282
Ethanolamine 0.269
Arginine 0.246
Glutamine 0.202
Glycine 0.173
1-Methylhistidine 0.171
Histidine 0.169
Cystathionine 0.0967
Taurine 0.0812
Aminoadipic acid 0.0798
beta-Alanine 0.0575
Phenylalanine 0.0546
Phosphoethanolamine 0.0281
3-Methylhistidine 0.00644
gamma-Aminobutyric acid 0.00488
Cysteine 0.00142
5-Hydroxylysine 0.00046
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot the scatter plots
for(metab in scat_met_order){
  p <- plot_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

Sample by Sample Correlations (Original Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Original Abundances)

pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2

Normalization (AutoScale)

# Normalize
# Note in this step, we use labelid to account for mayo's duplicate samples
################################################################################
norm_df <- plot_df %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  bind_rows()

Boxplots (Normalized Abundances)

# Plot Boxplots faceted by shared metabolites
################################################################################
norm_df %>%
  ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
  geom_boxplot(alpha = 0.8) +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
             x = "Abundance", y = "") 

Metabolite by Metabolite Correlation (Normalized Abundances)

################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
  unite(METABOLITE_NAME,REPLICATE, 
        col = METABOLITE_NAME_REPLICATE, remove = F) %>%
  select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
  map(arrange, labelid) %>%
  map(tibble::column_to_rownames, var = "labelid") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)

# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
  shared_met_mat <-  do.call(cbind,x) 
}

Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}

# Reorder the correlation matrix
cormat <- cor1

# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)

################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat 
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

Scatterplots (Normalized Abundances)

# Normalize
# Subset the data for sites to compare (scatterplots)
###################################################
norm_join_df <- countdata_df %>%
  ungroup() %>%
  unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
  filter(TISSUE == tissue) %>%
  filter(NAMED == named) %>%
  filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
  unnest(COUNT_DATA) %>%
  filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
  unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
  mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                               grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  split(.$REPLICATE) %>%
  map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(column_to_rownames, var = "labelid_viallabel") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix) %>%
  map(as.data.frame) %>%
  map(rownames_to_column, var = "labelid_viallabel") %>%
  map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
  map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
                                     grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
                                    grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
  map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
  map(~select(., -REPLICATE)) %>%
  map(mutate, labelid = gsub(pattern = "_..*", replacement = '', labelid_viallabel)) %>%
  map(select, -labelid_viallabel) %>%
  purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))

# Plot scatter plots ordered by sample (include R2 values)
lm_df <- norm_join_df %>%
  rename(name = METABOLITE_NAME) %>%
  rename(x = reps[1]) %>%
  rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
  met_df <- lm_df %>%
    filter(name == metab)
  r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
  arrange(METABOLITE) %>%
  select(METABOLITE,R2) %>%
  arrange(desc(R2))
met_r2_df %>%
  knitr::kable(format = "html") %>%
  scroll_box(width = "100%", height = "200px")
METABOLITE R2
2-Aminobutyric acid 0.92
Alanine 0.746
Tyrosine 0.711
Citrulline 0.61
Ornithine 0.605
Threonine 0.594
Aspartic acid 0.584
Serine 0.58
Isoleucine 0.55
Asparagine 0.541
Proline 0.54
Hydroxyproline 0.508
Lysine 0.474
Tryptophan 0.427
Carnosine 0.376
Valine 0.375
Sarcosine 0.366
Glutamic acid 0.365
Leucine 0.313
Methionine 0.296
Anserine 0.282
Ethanolamine 0.269
Arginine 0.246
Glutamine 0.202
Glycine 0.173
1-Methylhistidine 0.171
Histidine 0.169
Cystathionine 0.0967
Taurine 0.0812
Aminoadipic acid 0.0798
beta-Alanine 0.0575
Phenylalanine 0.0546
Phosphoethanolamine 0.0281
3-Methylhistidine 0.00644
gamma-Aminobutyric acid 0.00488
Cysteine 0.00142
5-Hydroxylysine 0.00046
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()

# Plot scatter plots (Normalized)
################################################################################
# Plot the scatter plots
for(metab in scat_met_order){
  p <- norm_join_df %>%
    filter(METABOLITE_NAME == metab) %>%
  ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", size = 1) +
  geom_abline(linetype="dashed") +
  facet_wrap(~ METABOLITE_NAME) +
  expand_limits(x = 0, y = 0) +
  #labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
  coord_fixed(ratio=1)
  plot(p)
}

# norm_join_df %>%
#   mutate(METABOLITE_NAME = factor(METABOLITE_NAME, levels = scat_met_order)) %>%
#   ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
#   geom_point(alpha = 0.5) +
#   geom_smooth(method = "lm", size = 1) +
#   geom_abline(linetype="dashed") +
#   facet_wrap(~ METABOLITE_NAME) +
#   expand_limits(x = 0, y = 0) +
#   labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
#   coord_fixed(ratio=1)

Distributions (Normalized Abundances)

# Plot the density plot for all the gene counts
################################################################################
norm_df %>%
  ggplot(aes(x = VALUE, color = REPLICATE)) +
  geom_density() +
  labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
       x = "Abundance",
       y = "Density")

# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in reps){
  # Collect numeric vector
  num_vec <- norm_df %>%
    filter(REPLICATE == rep) %>%
    select(VALUE) %>% unlist() %>% unname()
  df <- NumericSummaryStats(num_vec)
  row.names(df) <- rep
  df_all <- rbind(df_all, df)
}
df_all
##     NA_COUNT NA_FREQ MEAN MEDIAN  MAX   MIN MID_RANGE VARIANCE STD_DEV SE_MEAN    Q1   Q3 KURTOSIS SKEW
## 811        0       0    0  -0.14 8.37 -2.70     11.07     0.99    0.99    0.02 -0.66 0.55     2.36 0.86
## 812        0       0    0  -0.12 4.60 -2.67      7.27     0.99    0.99    0.02 -0.71 0.62     0.57 0.54
##     BC_LAMBDA
## 811      None
## 812      None
#}

Sample by Sample Correlations (Normalized Abundances)

################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################

# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
  unite(labelid,REPLICATE, 
        col = labelid_REPLICATE, remove = F) %>%
  select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
  split(.$REPLICATE) %>%
  map(select, -REPLICATE) %>%
  map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
  map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
  map(as.matrix) %>%
  map(AutoScaleMatrix)
# Subset matrices
shared_sam_mat <-  do.call(rbind,x) %>% 
  t()

# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
        cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
        cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}

# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)

################################################################################
####################### Unclustered ############################################
################################################################################

# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat

# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
  pivot_wider(names_from = Var2, values_from = value) %>%
  as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
  select(-Var1) %>%
  as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]

# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))

# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks, 
                  cluster_rows = F, cluster_cols = F,fontsize = 6)

PCA Analysis (Normalized Abundances)

# Plot a PCA with outliers labeled
# shared_sam_mat <- shared_sam_mat %>%
#   t() %>% AutoScaleMatrix() %>%
#   t()
pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
  tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
  sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')

p0 <- PC %>%
  ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
  geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
  ggtitle('Shared Samples and Shared Metabolites') + 
  xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
  ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
  theme(aspect.ratio=1)
p0

p1 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        scale_color_manual(values=ec_colors) +
        theme(aspect.ratio=1)
p1

PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
        ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
        geom_point(size = 3) +
  geom_line(aes(group = labelid),color="dark grey") +
        ggtitle('Shared Samples and Shared Metabolites') + 
        xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
        ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
        theme(aspect.ratio=1)
p2